sdm_1

Goals

Create initial SDM using:

  • Observations
    Species with most OBIS observations. Later try temporally subsetting to look for migratory patterns. And run for all.

  • Environment
    Environmental data from sdmpredictors. Need to know whether demersal or pelagic. For now extract static climatic environmental predictors for model fitting and prediction. Later extract dynamic predictors synchronous in time for model fitting and predict with hind-/now-/fore-cast or climatic seasonal snapshot.

  • Model
    Use Maxent.

References

Setup

# libraries ----
options(java.parameters = "-Xmx8000m")
librarian::shelf(
  arrow, dismo, dplyr, DT, glue, here, leafem, 
  leaflet.extras, readr, rJava, robis, sdmpredictors, terra, sf, mapview,
  quiet = T)
Warning: package 'arrow' was built under R version 4.3.1
Warning: package 'leafem' was built under R version 4.3.1
options(readr.show_col_types = F)

# variables -----
dir_data <- "/Users/bbest/My Drive/projects/mbon-sdm/data"
obis_prq <- glue("{dir_data}/raw/obis.org/obis_20230726.parquet")

# functions -----
get_sp_occ_obis_prq <- function(
    aphia_id,
    obis_prq  = "/Users/bbest/My Drive/projects/mbon-sdm/data/raw/obis.org/obis_20230726.parquet",
    cols_keep = c(
      "id",
      "phylum",
      "class",
      "taxonRank",
      "scientificName",
      "AphiaID",
      "date_mid",
      "decimalLongitude",
      "decimalLatitude",
      "depth",
      "individualCount",
      "flags")) {
  # get species occurrences from OBIS parquet file
  
  # TODO: add caching per species request/args
  
  # return only observations with valid coordinates and year
  o <- open_dataset(obis_prq) |> 
    filter(
      !is.na(date_mid),
      !is.na(decimalLongitude),
      !is.na(decimalLatitude),
      AphiaID == !!aphia_id) |> 
    select(all_of(cols_keep)) |> 
    collect() |>
    mutate(
      date_mid =as.POSIXct(
        date_mid/1000, origin = "1970-01-01",tz = "GMT") |> 
        as.Date()) |> 
    st_as_sf(
      coords = c("decimalLongitude", "decimalLatitude"), 
      crs = 4326)
}

Species candidates

# species with most observations
sp_gull <- read_csv("data/obis_top-species.csv") |> 
  arrange(desc(n_obs)) |>
  slice(1)
  
# non-bird species with most observations
sp_herring <- read_csv("data/obis_top-species.csv") |> 
  filter(class != "Aves") |> 
  arrange(desc(n_obs)) |>
  slice(1)

# species with ~100 observations (and most in Class)
sp_jelly <- read_csv("data/obis_top-species.csv") |> 
  filter(n_obs > 240) |> 
  arrange(n_obs) |>
  slice(1)

obs: N. Atlantic right whale

N Atlantic right whale (Eubalaena glacialis)

# right whale: surface, migratory, endangered
sp_rwhale <- tibble(
  AphiaID = 159023)

# prep single species observations ---
sp       <- sp_rwhale
aphia_id <- sp$AphiaID

# get observations
obs <- get_sp_occ_obis_prq(aphia_id)
obs
Simple feature collection with 7359 features and 10 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -93.3655 ymin: -33.88334 xmax: 153.56 ymax: 71.38
Geodetic CRS:  WGS 84
# A tibble: 7,359 × 11
   id             phylum class taxonRank scientificName AphiaID date_mid   depth
 * <chr>          <chr>  <chr> <chr>     <chr>            <int> <date>     <dbl>
 1 6ed27cbe-b7d5… Chord… Mamm… Species   Eubalaena gla…  159023 2009-08-28    NA
 2 6ef3fd6f-1396… Chord… Mamm… Species   Eubalaena gla…  159023 2006-08-18    NA
 3 6ef6f965-ddb3… Chord… Mamm… Species   Eubalaena gla…  159023 2019-08-09    NA
 4 6f3607da-d76f… Chord… Mamm… Species   Eubalaena gla…  159023 2006-07-15    NA
 5 6f4d9844-ae21… Chord… Mamm… Species   Eubalaena gla…  159023 2002-09-19    NA
 6 6f5eca54-a19b… Chord… Mamm… Species   Eubalaena gla…  159023 2001-09-19    NA
 7 6f88b96c-61cd… Chord… Mamm… Species   Eubalaena gla…  159023 2009-08-25    NA
 8 6f8dea57-f999… Chord… Mamm… Species   Eubalaena gla…  159023 2002-08-30    NA
 9 6f93b0d9-e829… Chord… Mamm… Species   Eubalaena gla…  159023 2006-09-05    NA
10 6f971a91-e863… Chord… Mamm… Species   Eubalaena gla…  159023 2007-09-17    NA
# ℹ 7,349 more rows
# ℹ 3 more variables: individualCount <chr>, flags <chr>, geometry <POINT [°]>
# plot observations
mapView(obs) %>%
  .@map |> 
  leaflet.extras::addFullscreenControl()

env: sdmpredictors

librarian::shelf(
  sdmpredictors, skimr, terra)

  The 'cran_repo' argument in shelf() was not set, so it will use
  cran_repo = 'https://cran.r-project.org' by default.

  To avoid this message, set the 'cran_repo' argument to a CRAN
  mirror URL (see https://cran.r-project.org/mirrors.html) or set
  'quiet = TRUE'.
# see ../aquamaps-downscaled/index.qmd for creation
env_tif  <- here("../aquamaps-downscaled/data/bio-oracle.tif")

stopifnot(file.exists(env_tif))

env <- rast(env_tif)
names(env) <- c(
  "Temp",
  "Salinity",
  "PrimProd",
  "IceCon")
plot(env)

obs_env <- extract(env, obs, cells=T) |> 
  as_tibble() |> 
  mutate(
    presence = 1)

bk_cells <- setdiff(cells(env), obs_env$cell)
bk_env <- tibble(
  presence = 0,
  cell     = bk_cells) |> 
  bind_cols(
    values(env)[bk_cells,]) |> 
  distinct(pick(all_of(names(env))), .keep_all = T)
# rows: 6,181,644 -> 6,178,309 after distinct() and na.omit()

set.seed(42) 

d_env <- bind_rows(
  obs_env,
  bk_env |> 
    slice(sample(1:nrow(bk_env), nrow(obs_env)) ) ) |> 
  select(
    presence, cell,
    all_of(names(env))) |> 
  na.omit()

tail(d_env)
# A tibble: 6 × 6
  presence    cell  Temp Salinity PrimProd   IceCon
     <dbl>   <dbl> <dbl>    <dbl>    <dbl>    <dbl>
1        0 4516596 28.0      34.5  0.00714 0       
2        0 5068209 25.4      35.2  0.0119  0       
3        0 7990275 -1.07     33.9  0.00173 0.422   
4        0 7580430  1.28     33.8  0.00241 0.000265
5        0 7133196  8.96     34.2  0.00388 0       
6        0 4203555 27.7      34.4  0.00419 0       
# table(d_env$presence)
#       0       1 
# 6178319    7355
#    0    1 
# 7359 7355 

d_env
# A tibble: 14,714 × 6
   presence    cell  Temp Salinity PrimProd     IceCon
      <dbl>   <dbl> <dbl>    <dbl>    <dbl>      <dbl>
 1        1 2334160  8.54     30.0  0.0270  0.00000500
 2        1 2351442  8.72     30.5  0.0169  0.00000200
 3        1 2187314  7.11     27.3  0.0137  0.0297    
 4        1 2442173  9.11     31.0  0.00347 0.00000200
 5        1 2364405  8.77     30.6  0.0119  0.00000100
 6        1 2364405  8.77     30.6  0.0119  0.00000100
 7        1 2334158  8.48     30.1  0.0253  0.00000600
 8        1 2351443  8.75     30.5  0.0162  0.00000200
 9        1 2355762  8.73     30.7  0.0144  0.00000100
10        1 2442178  9.32     30.9  0.00338 0.00000200
# ℹ 14,704 more rows

model: predicts::maxent()

example

librarian::shelf(predicts)

  The 'cran_repo' argument in shelf() was not set, so it will use
  cran_repo = 'https://cran.r-project.org' by default.

  To avoid this message, set the 'cran_repo' argument to a CRAN
  mirror URL (see https://cran.r-project.org/mirrors.html) or set
  'quiet = TRUE'.
Warning: package 'predicts' was built under R version 4.3.1
#?predicts::MaxEnt
# MaxEnt()
# This is MaxEnt_model version 3.4.3

# get predictor variables
f <- system.file("ex/bio.tif", package="predicts")
preds <- rast(f)
plot(preds)

# file with presence points
occurence <- system.file("/ex/bradypus.csv", package="predicts")
occ <- read.csv(occurence)[,-1]

# witholding a 20% sample for testing 
fold     <- folds(occ, k=5)
occtest  <- occ[fold == 1, ]
occtrain <- occ[fold != 1, ]

# fit model
me <- MaxEnt(preds, occtrain)

# see the MaxEnt results in a browser:
me
class    : MaxEnt_model 
variables: bio1 bio5 bio6 bio7 bio8 bio9 bio12 bio16 bio17 
# plot showing importance of each variable
plot(me, main="me: Variable contribution")

# TODO: try categorical, not in preds
# # use "args"
# me2 <- MaxEnt(preds, occtrain, factors='biome', args=c("-J", "-P"))
# 
# # plot showing importance of each variable
# plot(me2, main="me2: Variable contribution")

# response curves
library(ggplot2)

d <- tibble()
for (v in names(preds)){  # v = names(preds)[2]
  pr <- partialResponse(me, var=v)
  d <- bind_rows(
    d, 
    pr |> 
      rename(value = 1) |> 
      mutate(
        var = v))
  # plot(pr, type="l", las=1)
  # TODO: lattice ggplot
}
g <- d |> 
  ggplot(aes(x = value, y = p)) +
  geom_line() +
  facet_wrap(
    ~var, scales = "free") +
  theme_bw()
g

plotly::ggplotly(g)
# TODO: convert to function

# pr2 <- partialResponse2(me, var="bio1", var2 = "bio5")
# plot(pr2, type="l", las=1)

# predict to entire dataset
r <- predict(me, preds) 
plot(r)
points(occ)

# with some options:
r <- predict(me, preds, args=c("outputformat=raw"))
plot(r)
points(occ)

rwhale

# get predictor variables
preds <- env
plot(preds)

# witholding a 20% sample for testing 
occ <- obs |> 
  mutate(
    lon = st_coordinates(geometry)[,1],
    lat = st_coordinates(geometry)[,2]) |>
  st_drop_geometry() |> 
  select(lon, lat)
fold     <- folds(occ, k=5)
occtest  <- occ[fold == 1, ]
occtrain <- occ[fold != 1, ]

# fit model
system.time({
  me <- MaxEnt(preds, occtrain)
})
Warning in .local(x, p, ...): 2 (0.12%) of the presence points have NA
predictor values
   user  system elapsed 
 76.231   0.522  76.635 
# Warning message:
# In .local(x, p, ...) :
#   3 (0.19%) of the presence points have NA predictor values


# see the MaxEnt results in a browser:
me
class    : MaxEnt_model 
variables: Temp Salinity PrimProd IceCon 
# plot showing importance of each variable
plot(me, main="me: Variable contribution")

# TODO: try categorical, not in preds
# # use "args"
# me2 <- MaxEnt(preds, occtrain, factors='biome', args=c("-J", "-P"))
# 
# # plot showing importance of each variable
# plot(me2, main="me2: Variable contribution")

# response curves
library(ggplot2)

system.time({
d <- tibble()
for (v in names(preds)){  # v = names(preds)[2]
  pr <- partialResponse(me, var=v)
  d <- bind_rows(
    d, 
    pr |> 
      rename(value = 1) |> 
      mutate(
        var = v))
  # plot(pr, type="l", las=1)
  # TODO: lattice ggplot
}
g <- d |> 
  ggplot(aes(x = value, y = p)) +
  geom_line() +
  facet_wrap(
    ~var, scales = "free") +
  theme_bw()
g
})
   user  system elapsed 
 18.749   0.200  18.875 
# plotly::ggplotly(g)
# TODO: convert to function

# pr2 <- partialResponse2(me, var="bio1", var2 = "bio5")
# plot(pr2, type="l", las=1)

# predict to entire dataset
system.time({
  r <- predict(me, preds)
})
   user  system elapsed 
 85.583   1.209  87.416 
plot(r)
points(occ)

# with some options:
# system.time({
#   r <- predict(me, preds, args=c("outputformat=raw"))
# })
# plot(r)
# points(occ)

mdl: maxnet

librarian::shelf(
  maxnet)

system.time({
  m <- maxnet(
    p    = d_env |> pull(presence), 
    data = d_env |> select(-presence, -cell) |> as.data.frame())
}) # present/absent:7,359/7,355: 50.149s

plot(m, type="cloglog")

predict(m, new)

env |> terra::add()

env
# 2160 * 4320 = 9331200, 5
d_env <- values(env, dataframe = T, na.rm=F)
# d_env <- d_env |> 
#   select(-pred)
dim(d_env) # 9,331,200       4

ncell(env) # 9,331,200

d_env_notna <- na.omit(d_env)
dim(d_env_notna) # 6,183,457

x <- d_env_notna

intersect(attr(x, "na.action"), attr(x, "row.names"))
class(attr(x, "row.names"))


p <- predict(m, d_env_notna, clamp = T, type = "logistic") #, clamp=T, type=c("link","exponential","cloglog","logistic"), ...)
length(p[,1]) # 6,183,457

p <- predict(m, raster::stack(env), clamp = F, type = "logistic") #, clamp=T,

env$pred <- NA
env$pred[attr(x, "row.names")] <- p[,1]

plot(env$pred)
setValues(env, p)

# why are these different lengths?
as.numeric(p)
ncell(env)

length(cells(env))
length(as.numeric(p))

plot(env$pred)
mod <- maxnet(p, data, maxnet.formula(p, data, classes="lq"))
plot(mod, "tmp6190_ann")

vs Kernel Density Estimates (KDE)

librarian::shelf(eks)

  The 'cran_repo' argument in shelf() was not set, so it will use
  cran_repo = 'https://cran.r-project.org' by default.

  To avoid this message, set the 'cran_repo' argument to a CRAN
  mirror URL (see https://cran.r-project.org/mirrors.html) or set
  'quiet = TRUE'.